library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
theme_set(theme_bw(base_size = 12))
setwd("~/Documents/git/popgen-project/")

First, I load the data and explore

df <- read_tsv("ArchaicAdmixture/ArchaicSegments.txt") %>% 
  mutate(length = end-start+1000) %>% 
  filter(length>0)
## Rows: 524714 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): name, chrom, pop, country, region, outgroup, method
## dbl (8): start, end, length, snps, MeanProb, Shared_with_Altai, Shared_with_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df) %>% knitr::kable()
name chrom start end length snps pop country region MeanProb Shared_with_Altai Shared_with_Denisova Shared_with_Vindija outgroup method
ERS699811 1 1562000 1603000 42000 5 Toto India EastAsia 0.5572001 0 0 0 SubAfricans HMM
ERS699811 1 2791000 2815000 25000 12 Toto India EastAsia 0.8304250 7 0 5 SubAfricans HMM
ERS699811 1 3058000 3113000 56000 36 Toto India EastAsia 0.9733212 10 6 12 SubAfricans HMM
ERS699811 1 3341000 3384000 44000 21 Toto India EastAsia 0.9102532 9 1 8 SubAfricans HMM
ERS699811 1 3433000 3453000 21000 13 Toto India EastAsia 0.9437119 8 0 7 SubAfricans HMM
ERS699811 1 3579000 3601000 23000 10 Toto India EastAsia 0.7721082 2 0 3 SubAfricans HMM
summary(df)
##      name              chrom               start                end           
##  Length:524709      Length:524709      Min.   :     3000   Min.   :    47000  
##  Class :character   Class :character   1st Qu.: 33423000   1st Qu.: 33502000  
##  Mode  :character   Mode  :character   Median : 71502000   Median : 71566000  
##                                        Mean   : 80675269   Mean   : 80761676  
##                                        3rd Qu.:117715000   3rd Qu.:117799775  
##                                        Max.   :249134000   Max.   :249176000  
##      length             snps            pop              country         
##  Min.   :   1000   Min.   :  0.00   Length:524709      Length:524709     
##  1st Qu.:  29000   1st Qu.:  6.00   Class :character   Class :character  
##  Median :  53204   Median : 11.00   Mode  :character   Mode  :character  
##  Mean   :  87407   Mean   : 18.15                                        
##  3rd Qu.: 104000   3rd Qu.: 22.00                                        
##  Max.   :5932442   Max.   :468.00                                        
##     region             MeanProb      Shared_with_Altai Shared_with_Denisova
##  Length:524709      Min.   :0.5000   Min.   :  0.000   Min.   :  0.000     
##  Class :character   1st Qu.:0.7398   1st Qu.:  0.000   1st Qu.:  0.000     
##  Mode  :character   Median :0.8686   Median :  1.000   Median :  0.000     
##                     Mean   :0.8281   Mean   :  5.731   Mean   :  2.941     
##                     3rd Qu.:0.9396   3rd Qu.:  7.000   3rd Qu.:  3.000     
##                     Max.   :1.0000   Max.   :267.000   Max.   :188.000     
##  Shared_with_Vindija   outgroup            method         
##  Min.   :  0.000     Length:524709      Length:524709     
##  1st Qu.:  0.000     Class :character   Class :character  
##  Median :  1.000     Mode  :character   Mode  :character  
##  Mean   :  6.318                                          
##  3rd Qu.:  8.000                                          
##  Max.   :263.000

How many individuals do we have?

length(unique(df$name))
## [1] 358

How many populations?

length(unique(df$pop))
## [1] 110

What regions do we have?

unique(df$region)
## [1] "EastAsia"           "WestEurasia"        "CentralAsiaSiberia"
## [4] "SouthAsia"          "Melanesia"

Make a plot of the average archaic segment length (by population)

mean_seg_pop <- df %>% 
  group_by(pop, region) %>% 
  summarise(`Mean length` = mean(length))
## `summarise()` has grouped output by 'pop'. You can override using the `.groups`
## argument.
head(mean_seg_pop)
## # A tibble: 6 × 3
## # Groups:   pop [6]
##   pop       region             `Mean length`
##   <chr>     <chr>                      <dbl>
## 1 Abkhasian WestEurasia               73275.
## 2 Adygei    WestEurasia               75431.
## 3 Albanian  WestEurasia               78357.
## 4 Aleut     CentralAsiaSiberia        81293.
## 5 Altaian   CentralAsiaSiberia        83659.
## 6 Ami       EastAsia                  83183.
mean_seg_pop %>% 
  ungroup() %>% 
  arrange(region) %>% 
  mutate(pop = factor(pop, pop)) %>% 
  ggplot(aes(x = pop, y = `Mean length`, fill = region)) +
  geom_bar(position="dodge", stat="identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size=9))

Who has the highest mean segment length?

mean_seg_pop[which.max(mean_seg_pop$`Mean length`),]
## # A tibble: 1 × 3
## # Groups:   pop [1]
##   pop     region    `Mean length`
##   <chr>   <chr>             <dbl>
## 1 Papuans Melanesia       101734.

Average length of segments by region?

mean_seg_reg <- df %>% 
  group_by(region) %>% 
  summarise(`Mean length` = mean(length))

mean_seg_reg %>% 
  ggplot(aes(x = region, y = `Mean length`)) +
  geom_col() +
  coord_flip()

Q1: what is the total length of archaic segments in each individual

total_seg_ind <- df %>% 
  group_by(name, pop, region) %>% 
  summarise(total_length = sum(length))
## `summarise()` has grouped output by 'name', 'pop'. You can override using the
## `.groups` argument.
total_seg_ind %>% 
  arrange(total_length) %>% 
  ggplot(aes(x=total_length, fill=region)) +
  geom_histogram(bins=100)

Q2: summarise per region and population

total_seg_ind %>% 
  group_by(region, pop) %>% 
  summarise(mean_total_length = mean(total_length)) %>% 
  ungroup() %>% 
  arrange(region) %>% 
  mutate(pop = factor(pop,pop)) %>% 
  ggplot(aes(x=pop, y=mean_total_length, fill=region)) +
  geom_bar(position='dodge', stat='identity') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.

Q3: Who has the most archaic ancestry? Why?

  • Papuans have the most (Melanesia)
df %>% 
  pivot_longer(
    cols = starts_with("Shared_with_"),  # Pivot the shared columns
    names_to = "shared_with",  # Create a new column for group information
    values_to = "count"  # Create a new column for the y-values
  ) %>% 
  ggplot(aes(x=region, y=count, fill=shared_with)) +
  geom_bar(stat='identity', position='dodge')

Q4: What is the length distribution of fragments for the five different regions (hint: you can use facet_grid to plot all at once)

df %>% 
  ggplot(aes(length)) +
  geom_histogram(bins=30) +
  facet_wrap(~region, nrow=1) +
  xlim(0,1e+6) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle=90,hjust=1)
  ) +
  NULL
## Warning: Removed 511 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 10 rows containing missing values (`geom_bar()`).

df %>%
  group_by(region) %>%
  ggplot(aes(y=length,x=region)) + 
  theme_bw() +
  geom_boxplot() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Q5: What is the average length of segments by population and region sorted

mean_seg_pop %>% 
  ungroup() %>% 
  arrange(region) %>% 
  mutate(pop = factor(pop, pop)) %>% 
  ggplot(aes(x = pop, y = `Mean length`, fill = region)) +
  geom_bar(position="dodge", stat="identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size=9))

Q6: What can cause different mean fragment lengths?

Differences in recombination rate, more than one wave of introgression in one of the regions. Recent gene flow/introgression from denisovans/neanderthal after migration out of Africa. Generation time.

Origin of fragments

snps <- read_tsv("ArchaicAdmixture/SNPs.txt")
## Rows: 11624912 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): name, chrom, region, archaic, Denis
## dbl (1): start
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(snps)
## # A tibble: 6 × 6
##   name      chrom   start region   archaic        Denis  
##   <chr>     <chr>   <dbl> <chr>    <chr>          <chr>  
## 1 ERS699811 1     1562536 EastAsia human          _SubAfr
## 2 ERS699811 1     1563789 EastAsia human          _SubAfr
## 3 ERS699811 1     1564194 EastAsia human          _SubAfr
## 4 ERS699811 1     1601052 EastAsia human          _SubAfr
## 5 ERS699811 1     1603103 EastAsia human          _SubAfr
## 6 ERS699811 1     2792312 EastAsia AltaiNea_nofil _SubAfr
unique(snps$archaic)
## [1] "human"          "AltaiNea_nofil" "Vi33.19_nofil"  "Denisova_nofil"
unique(snps$Denis)
## [1] "_SubAfr"
nrow(snps)
## [1] 11624912

Q1: For each individual, assign the archaic segments to origin and reconstruct a Figure in the same style as Figure 5 of the Cell paper

df <- df %>%
  mutate(origin = case_when(
        Shared_with_Altai > Shared_with_Denisova | Shared_with_Vindija > Shared_with_Denisova ~ "Neanderthal",
        Shared_with_Altai < Shared_with_Denisova & Shared_with_Vindija < Shared_with_Denisova ~ "Denisovan",
        Shared_with_Altai + Shared_with_Vindija + Shared_with_Denisova == 0 ~ "Unclassified"
    ))


df %>% 
  group_by(name, origin, pop) %>% 
  summarise(total = sum(length)) %>% 
  ungroup() %>% 
  group_by(pop, origin) %>% 
  summarise(mean_seqs = mean(total)) %>% 
  ungroup() %>% 
  mutate(origin = factor(origin, c("Denisovan", "Unclassified", "Neanderthal"))) %>%
  ggplot(aes(x=pop, y=mean_seqs*10^-6, fill=origin)) +
  geom_bar(stat='identity') +
  coord_flip() +
  ylab("Mean detected archaic sequence per region (Mb)") +
  theme(axis.text.x = element_text(angle=90, hjust=1))
## `summarise()` has grouped output by 'name', 'origin'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'pop'. You can override using the `.groups`
## argument.

Q2. What are major differences? What can explain these differences?

There are a lot more archaic proportions on Papuans, especially from Denisovans. Differences in proportions and lengths between methods.

Q3: Summarize the results by averaging over region and plot these.

df %>% 
  group_by(name, origin, region) %>% 
  summarise(total = sum(length)) %>%
  ungroup() %>% 
  group_by(region, origin) %>% 
  summarise(mean_seqs = mean(total)) %>% 
  ungroup() %>% 
  mutate(origin = factor(origin, c("Denisovan", "Unclassified", "Neanderthal"))) %>%
  ggplot(aes(x=region, y=mean_seqs*10^-6, fill=origin)) +
  geom_bar(stat='identity') +
  coord_flip() +
  ylab("Mean detected archaic sequence per region (Mb)") +
  theme(axis.text.x = element_text(angle=90, hjust=1))
## `summarise()` has grouped output by 'name', 'origin'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.

Q6. Determine the fragment length distribution of segment of Neanderthal and Denisova origin separately for each region. Compare the median of the distributions.

medians <- df %>% 
  filter(origin %in% c("Denisovan", "Neanderthal")) %>% 
  group_by(region, origin) %>% 
  summarise(median_length = (median(length)),
            count=n(),
            nsamples = length(unique(name)))
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
df %>% filter(origin %in% c("Denisovan", "Neanderthal")) %>% 
  ggplot(aes(x=length, fill=origin)) +
  geom_density(alpha=0.2) +
  #geom_vline(aes(xintercept = median(length), color = origin)) +
  geom_vline(data=medians, aes(xintercept=median_length, color=origin)) +
  facet_wrap(~region, nrow=1) +
  scale_fill_manual(values = c("red", "blue")) + 
  xlim(0, 1e6) +
  theme(axis.text.x = element_text(angle = 90, hjust=1))
## Warning: Removed 509 rows containing non-finite values (`stat_density()`).

medians %>% knitr::kable()
region origin median_length count nsamples
CentralAsiaSiberia Denisovan 63000.0 1653 27
CentralAsiaSiberia Neanderthal 74000.0 16219 27
EastAsia Denisovan 60000.0 10907 132
EastAsia Neanderthal 72000.0 85361 132
Melanesia Denisovan 82000.0 73507 89
Melanesia Neanderthal 76944.5 81514 89
SouthAsia Denisovan 59000.0 2664 39
SouthAsia Neanderthal 69000.0 21786 39
WestEurasia Denisovan 51000.0 2369 71
WestEurasia Neanderthal 69000.0 33729 71

Do the quantiles

quantile((df %>% 
         filter(origin %in% c("Neanderthal","Denisovan")))$length)
##      0%     25%     50%     75%    100% 
##    1000   42000   74000  139000 4698985
quantile((df %>% 
         filter(origin %in% c("Neanderthal","Denisovan")) %>%
         mutate(total_snps = Shared_with_Altai + Shared_with_Denisova + Shared_with_Vindija))$total_snps)
##   0%  25%  50%  75% 100% 
##    1    6   14   29  561
# Use the quantiles (median) as the threshold

df %>%
    filter(origin %in% c("Neanderthal","Denisovan")) %>%
    mutate(total_snps = Shared_with_Altai + Shared_with_Denisova + Shared_with_Vindija) %>%
    filter(total_snps > 19) %>%
    ggplot() +
    geom_density(aes(length, fill=origin), color=NA,alpha=0.5) +
    theme_bw() + 
    facet_grid(~region) +
    xlim(0,2000000) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 8 rows containing non-finite values (`stat_density()`).

df %>% 
    filter(origin %in% c("Neanderthal","Denisovan")) %>%
    mutate(total_snps = Shared_with_Altai + Shared_with_Denisova + Shared_with_Vindija) %>%
    filter(total_snps > 50) %>%
    ggplot(aes(x=origin,y=length)) +
        geom_boxplot() +
        ggtitle('Denisovan fragment length per region') +
        theme_bw() + 
        facet_grid(~region) +
        scale_x_discrete() +
        theme(axis.text.x = element_text(angle = 90, hjust = 1))

What about the distribution of unclassified snps

df  %>%
    filter(origin == "Neanderthal") %>%
    group_by(region) %>%
    ggplot(aes(length)) +
        geom_histogram(bins = 50) +
        ggtitle('Neanderthal fragment length per region') +
        theme_bw() + facet_grid(~region) +
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) -> plot1

df  %>%
    filter(origin == "Denisovan") %>%
    ggplot(aes(length)) +
        geom_histogram(bins = 50) +
        ggtitle('Denisovan fragment length per region') +
        theme_bw() + facet_grid(~region)  +
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) -> plot2

df  %>%
    filter(origin == "Unclassified") %>%
    group_by(region) %>%
    ggplot(aes(length)) +
        geom_histogram(bins = 50) +
        ggtitle('Unclassified fragment length per region') +
        theme_bw() + facet_grid(~region) +
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) -> plot3

plot_grid(plot1, plot2, plot3, ncol=1)

Comparison of chromosomes

Q1. Determine the amount of archaic introgression on each chromosome for each of the five regions.

df  %>%
    group_by(chrom, region) %>%
    summarise(Frequency = sum(as.numeric(length))) %>%
    ggplot(aes(x = chrom, y = Frequency, fill = region)) +  
    geom_bar(position = "dodge", stat="identity") +
    theme_bw() +
    scale_x_discrete(limits= c(seq(1, 22, 1),'X')) + # ordering from 1 to 22 +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `summarise()` has grouped output by 'chrom'. You can override using the
## `.groups` argument.

Q2. Repeat this with assignment of regions to archaic species.

df  %>%
    group_by(chrom, origin) %>%
    summarise(Frequency = sum(as.numeric(length))) %>%
    ggplot(aes(x = chrom, y = Frequency, fill = origin)) +  
        geom_bar(position = "dodge", stat="identity") +
        theme_bw() +
        scale_x_discrete(limits= c(seq(1, 22, 1),'X')) + # ordering from 1 to 22
        theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `summarise()` has grouped output by 'chrom'. You can override using the
## `.groups` argument.

Q3. Combine the Neanderthal fragments for all individuals and plot all the fragments on top of each other along chromosomes (hint use alpha = 0.1). Can you find “deserts” of archaic admixture and/or evidence for places where Neanderthal ancestry has reached very high frequency?

chrom <- factor(c(1:22, 'X'), levels=c(1:22,'X'))
chrom_len <- c(2.50e+08, 2.45e+08, 1.98e+08, 1.95e+08, 1.82e+08, 
               1.71e+08, 1.60e+08, 1.47e+08, 1.42e+08, 1.36e+08, 
               1.35e+08, 1.35e+08, 1.16e+08, 1.08e+08, 1.03e+08, 
               9.03e+07, 8.33e+07, 8.04e+07, 6.00e+07, 6.44e+07, 
               4.90e+07, 5.15e+07, 1.55e+08)


df %>%
  mutate(chrom = factor(chrom, levels = c(1:22, 'X'))) %>% 
  arrange(region) %>%
  ggplot() +
  theme_bw() + 
  geom_segment(aes(x=start,xend=end,y=0, yend=1, col=origin), alpha=0.02) + 
  geom_blank() +
  geom_rect(data=tibble(chrom, chrom_len), aes(xmin=chrom_len, xmax=Inf, ymin=0, ymax=1), fill="grey") +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  facet_wrap(~chrom) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),  
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.y=element_blank(),
        legend.position="none") +
  xlab("Position")

Q4. You will find that the X chromosome is an outlier (compare to a chormosome of a similar size - chr8). How and why?

# Histogram
df %>% 
  group_by(chrom, region) %>% 
  filter(chrom %in% c('8', 'X')) %>% 
  summarise(Frequency = sum(as.numeric(length))) %>% 
  ggplot(aes(x = chrom, y=Frequency, fill = region)) +
  geom_bar(position='dodge', stat='identity') +
  theme_bw() +
  theme(axis.text.x = element_text(angle=90, hjust=1)) 
## `summarise()` has grouped output by 'chrom'. You can override using the
## `.groups` argument.

# Boxplot
df %>%
  group_by(chrom, region) %>%
  filter(chrom %in% c('8', 'X')) %>%
  summarise(Frequency = sum(as.numeric(length))) %>%
  ggplot(aes(x = chrom, y = Frequency)) +  
  geom_boxplot() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `summarise()` has grouped output by 'chrom'. You can override using the
## `.groups` argument.

  • There are much less archaic SNPS on X chromosome, possibly due to sexual antagonistic influence, or hybrid incompatibility ??

Q6. Do you find regions that are devoid of introgression for both the Neanderthal and the Denisovan admixture events?

Let us look at the EPAS1 as an example of adaptive introgression. This gene maps to chr2:46,520,806-46,613,842 in GRCh37 coordinates. Tibetan individuals have an increased amount of archaic polymorphism in this region.

df %>% 
  filter(origin %in% c("Neanderthal", "Denisovan")) %>% 
  filter(chrom == 2 & region == 'EastAsia' & start > 4.65e+07 & end < 4.67e+07) %>% 
  ggplot(aes(x=length, fill=origin)) +
  geom_histogram(bins=50) +
  facet_wrap(~country, nrow = 1) +
  theme(axis.text.x = element_text(angle=90, hjust=1))

Contrary, FOXP2 is an example of maladaptive introgression, consistent with the presence of deserts of archaic origin in around this gene.

df %>%
  filter(chrom == "7") %>%
  arrange(region) %>%
  ggplot() +
  theme_bw() + 
  geom_segment(aes(x=start,xend=end,y=0, yend=1, col=origin), alpha=0.02) + 
  facet_wrap(~region, ncol=1) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),  
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.y=element_blank(),
        legend.position="none") +
  geom_vline(xintercept = 113726365, alpha=0.5) +
  xlab("Position") + 
  ggtitle(paste("Chromosome 7")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

The end